This document will explore the DAPC analyses for the data given different populations.
library(PramCurry)
## Loading required package: poppr
## Loading required package: adegenet
## Loading required package: ade4
## ==========================
## adegenet 1.4-2 is loaded
## ==========================
##
## - to start, type '?adegenet'
## - to browse adegenet website, type 'adegenetWeb()'
## - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
##
##
## This is poppr version 1.1.2.99.70. To get started, type package?poppr
library(reshape2)
library(ggplot2)
library(ape)
library(poppr)
library(adegenet)
library(igraph)
##
## Attaching package: 'igraph'
##
## The following object is masked from 'package:ape':
##
## edges
options(stringsAsFactors = FALSE)
data(ramdat)
data(pop_data)
data(myPal)
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] igraph_0.7.1 ape_3.1-4.5 ggplot2_1.0.0 reshape2_1.4
## [5] PramCurry_1.0 poppr_1.1.2.99-70 adegenet_1.4-2 ade4_1.6-2
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 bitops_1.0-6 boot_1.3-11
## [4] caTools_1.17.1 colorspace_1.2-4 digest_0.6.4
## [7] dplyr_0.2 evaluate_0.5.5 fastmatch_1.0-4
## [10] formatR_1.0 gdata_2.13.3 ggmap_2.3
## [13] grid_3.1.2 gtable_0.1.2 gtools_3.4.1
## [16] htmltools_0.2.6 httpuv_1.3.0 knitr_1.6
## [19] labtools_0.4.1 lattice_0.20-29 lubridate_1.3.3
## [22] mapproj_1.2-2 maps_2.3-9 MASS_7.3-34
## [25] Matrix_1.1-4 memoise_0.2.1 munsell_0.4.2
## [28] nlme_3.1-117 parallel_3.1.2 pegas_0.6
## [31] permute_0.8-3 phangorn_1.99-7 plotrix_3.5-7
## [34] plyr_1.8.1 png_0.1-7 proto_0.3-10
## [37] Rcpp_0.11.2 RgoogleMaps_1.2.0.6 rjson_0.2.14
## [40] RJSONIO_1.3-0 rmarkdown_0.3.10 scales_0.2.4
## [43] shiny_0.10.1 stringr_0.6.2 tools_3.1.2
## [46] vegan_2.0-10 xtable_1.7-4 yaml_2.1.13
# plot_posterior <- function(da.object, gid, pal){
# posterior <- da.object$posterior
# names(dimnames(posterior)) <- c("sample", "population")
# to_merge <- data.frame(list(sample = dimnames(posterior)$sample,
# oldPopulation = pop(gid)))
# post <- melt(posterior, value.name = "probability")
# post <- merge(post, to_merge)
# if (is.numeric(post$sample)){
# post$sample <- factor(post$sample, levels = unique(post$sample))
# }
# outPlot <- ggplot(post, aes(x = sample, fill = population, y = probability)) +
# geom_bar(stat = "identity", position = "fill", width = 1) +
# theme_classic() +
# theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
# scale_y_continuous(expand = c(0, 0)) +
# scale_x_discrete(expand = c(0, 0)) +
# facet_wrap(~oldPopulation, scales = "free_x", drop = TRUE, ncol = 1) +
# scale_fill_manual(values = pal)
# return(outPlot)
# }
I have originally performed these by utilizing the adegenetServer() function. Since it is interactive, I will only post the code to do it here:
setpop(ramdat) <- ~ZONE2
save(ramdat, file = "zone2.rda")
adegenetServer()
This will give me the tools to look at things like the composition plot and the cross-validation plot.
setpop(ramdat) <- ~ZONE2
xval <- xvalDapc(ramdat@tab, pop(ramdat), n.pca.max = 15, result = "overall", n.rep = 1000)
## NULL
xval[-1]
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.1178 0.1421 0.1772
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 1 2 3 4 5 6 7 8 9 10
## 0.6187 0.6910 0.7274 0.7281 0.7369 0.7620 0.7692 0.7819 0.7837 0.7795
## 11 12 13 14 15
## 0.7789 0.8033 0.8010 0.7958 0.7977
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "12"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 1 2 3 4 5 6 7 8 9 10
## 0.3821 0.3107 0.2746 0.2738 0.2652 0.2397 0.2327 0.2200 0.2182 0.2224
## 11 12 13 14 15
## 0.2229 0.1987 0.2011 0.2062 0.2043
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "12"
##
## $DAPC
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2,
## n.da = ..3)
##
## $n.pca: 12 first PCs of PCA used
## $n.da: 6 discriminant functions saved
## $var (proportion of conserved variance): 0.976
##
## $eig (eigenvalues): 2022 110.1 57.13 49.19 17.73 ...
##
## vector length content
## 1 $eig 6 eigenvalues
## 2 $grp 513 prior group assignment
## 3 $prior 7 prior group probabilities
## 4 $assign 513 posterior group assignment
## 5 $pca.cent 31 centring vector of PCA
## 6 $pca.norm 31 scaling vector of PCA
## 7 $pca.eig 25 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 513 12
## 2 $means 7 12
## 3 $loadings 12 6
## 4 $ind.coord 513 6
## 5 $grp.coord 7 6
## 6 $posterior 513 7
## 7 $pca.loadings 31 12
## 8 $var.contr 31 6
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
setpop(ramdat) <- ~ZONE2
(z2.dapc <- dapc(ramdat, n.pca = 12, n.da = 6))
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.genind(x = ramdat, n.pca = 12, n.da = 6)
##
## $n.pca: 12 first PCs of PCA used
## $n.da: 6 discriminant functions saved
## $var (proportion of conserved variance): 0.976
##
## $eig (eigenvalues): 2022 110.1 57.13 49.19 17.73 ...
##
## vector length content
## 1 $eig 6 eigenvalues
## 2 $grp 513 prior group assignment
## 3 $prior 7 prior group probabilities
## 4 $assign 513 posterior group assignment
## 5 $pca.cent 31 centring vector of PCA
## 6 $pca.norm 31 scaling vector of PCA
## 7 $pca.eig 25 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 513 12
## 2 $means 7 12
## 3 $loadings 12 6
## 4 $ind.coord 513 6
## 5 $grp.coord 7 6
## 6 $posterior 513 7
## 7 $pca.loadings 31 12
## 8 $var.contr 31 6
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
loadingplot(z2.dapc$var.contr, axis = 1)
loadingplot(z2.dapc$var.contr, axis = 2)
png("loading_figure.png", width = 183, height = 88, units = "mm", res = 300)
loadingplot(z2.dapc$var.contr, axis = 1)
dev.off()
## pdf
## 2
popPal <- char2pal(ramdat@pop.names)
scatter(z2.dapc, cex = 2, legend = TRUE, clabel = FALSE, col = popPal,
posi.leg = "bottomleft", scree.pca = TRUE, posi.pca = "topright",
posi.da = "bottomright", cleg = 0.75, xax = 1, yax = 2, inset.solid = 1,
ratio.pca = 0.2, ratio.da = 0.2)
png("no_nursery_DAPC.png", width = 183, height = 183, units = "mm", res = 300)
scatter(z2.dapc, cex = 2, legend = TRUE, clabel = FALSE, col = popPal,
posi.leg = "bottomright", scree.pca = TRUE, posi.pca = "topright",
posi.da = "bottomleft", cleg = 0.75, xax = 1, yax = 2, inset.solid = 1,
ratio.pca = 0.2, ratio.da = 0.2)
dev.off()
## pdf
## 2
plot_posterior(z2.dapc, ramdat, popPal)
Here, I am removing the Cape Sebastian isolates as well as the populations with less than 10 isolates.
noseb <- popsub(ramdat, blacklist = "HunterCr", drop = FALSE)
noseb.gt.10 <- selPopSize(noseb, n = 10)
# save(noseb.gt.10, file = "nosebpr.rda")
noseb.xval <- xvalDapc(noseb.gt.10@tab, pop(noseb.gt.10), n.pca.max = 15,
result = "overall", n.rep = 1000)
## NULL
noseb.xval[-1]
## $`Median and Confidence Interval for Random Chance`
## 2.5% 50% 97.5%
## 0.1633 0.2004 0.2350
##
## $`Mean Successful Assignment by Number of PCs of PCA`
## 1 2 3 4 5 6 7 8 9 10
## 0.6048 0.6281 0.6700 0.6952 0.7389 0.7643 0.7667 0.7690 0.7594 0.7642
## 11 12 13 14 15
## 0.7829 0.7826 0.7808 0.7842 0.7855
##
## $`Number of PCs Achieving Highest Mean Success`
## [1] "15"
##
## $`Root Mean Squared Error by Number of PCs of PCA`
## 1 2 3 4 5 6 7 8 9 10
## 0.3965 0.3760 0.3346 0.3115 0.2666 0.2420 0.2396 0.2377 0.2469 0.2421
## 11 12 13 14 15
## 0.2237 0.2237 0.2261 0.2225 0.2214
##
## $`Number of PCs Achieving Lowest MSE`
## [1] "15"
##
## $DAPC
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2,
## n.da = ..3)
##
## $n.pca: 15 first PCs of PCA used
## $n.da: 4 discriminant functions saved
## $var (proportion of conserved variance): 0.987
##
## $eig (eigenvalues): 185.4 99.69 66.05 29.63 vector length content
## 1 $eig 4 eigenvalues
## 2 $grp 443 prior group assignment
## 3 $prior 5 prior group probabilities
## 4 $assign 443 posterior group assignment
## 5 $pca.cent 31 centring vector of PCA
## 6 $pca.norm 31 scaling vector of PCA
## 7 $pca.eig 24 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 443 15
## 2 $means 5 15
## 3 $loadings 15 4
## 4 $ind.coord 443 4
## 5 $grp.coord 5 4
## 6 $posterior 443 5
## 7 $pca.loadings 31 15
## 8 $var.contr 31 4
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
(noseb.dapc <- dapc(noseb.gt.10, n.pca = 15, n.da = 4))
## #################################################
## # Discriminant Analysis of Principal Components #
## #################################################
## class: dapc
## $call: dapc.genind(x = noseb.gt.10, n.pca = 15, n.da = 4)
##
## $n.pca: 15 first PCs of PCA used
## $n.da: 4 discriminant functions saved
## $var (proportion of conserved variance): 0.987
##
## $eig (eigenvalues): 185.4 99.69 66.05 29.63 vector length content
## 1 $eig 4 eigenvalues
## 2 $grp 443 prior group assignment
## 3 $prior 5 prior group probabilities
## 4 $assign 443 posterior group assignment
## 5 $pca.cent 31 centring vector of PCA
## 6 $pca.norm 31 scaling vector of PCA
## 7 $pca.eig 24 eigenvalues of PCA
##
## data.frame nrow ncol
## 1 $tab 443 15
## 2 $means 5 15
## 3 $loadings 15 4
## 4 $ind.coord 443 4
## 5 $grp.coord 5 4
## 6 $posterior 443 5
## 7 $pca.loadings 31 15
## 8 $var.contr 31 4
## content
## 1 retained PCs of PCA
## 2 group means
## 3 loadings of variables
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups
## 6 posterior membership probabilities
## 7 PCA loadings of original variables
## 8 contribution of original variables
scatter(noseb.dapc, cex = 2, legend = TRUE, clabel = FALSE,
col = popPal[noseb.gt.10@pop.names], scree.pca = TRUE,
posi.leg = "bottomleft", posi.pca = "topleft",
cleg = 0.75)
plot_posterior(noseb.dapc, noseb.gt.10, popPal)
Since we now have a model from our well represented populations, we can use this to predict the sources of individuals from these populations. Let’s see if they match up
to_test <- noseb@pop.names[!noseb@pop.names %in% noseb.gt.10@pop.names]
testpop <- popsub(noseb, to_test, drop = FALSE)
predictions <- predict.dapc(noseb.dapc, newdata = testpop)
plot_posterior(predictions, testpop, popPal)
to_test <- ramdat@pop.names[!ramdat@pop.names %in% noseb.gt.10@pop.names]
testpop <- popsub(ramdat, to_test, drop = FALSE)
predictions <- predict.dapc(noseb.dapc, newdata = testpop)
plot_posterior(predictions, testpop, popPal)
Using 95% inertia ellipses.
scatter(noseb.dapc, cex = 2, legend = TRUE, clabel = FALSE, cellipse = 2.5,
col = popPal[noseb.gt.10@pop.names], scree.pca = FALSE,
posi.leg = "bottomleft", posi.pca = "topleft",
cleg = 0.75, bg="white", scree.da=0, pch=20,
xlim = c(-3, 7), ylim = c(-3, 3.5), solid = 0.5)
par(xpd=TRUE)
# points(noseb.dapc$ind.coord[, 1], noseb.dapc$ind.coord[, 2], pch=20,
# col = transp(popPal[noseb.gt.10@pop.names]), cex = 2)
points(predictions$ind.scores[, 1], predictions$ind.scores[, 2],
pch = 20, col = transp("black", 0.2), cex = 3)
points(predictions$ind.scores[, 1], predictions$ind.scores[, 2],
pch = as.character(pop(testpop)),
col = transp(popPal[as.character(pop(testpop))], 0.7), cex = 1)
add.scatter.eig(noseb.dapc$eig, 15, 1, 2, posi="bottomright", inset=.02)
sebpredmat <- matrix(c(
rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.5)),
rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.95)),
rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.99)),
rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.999))),
ncol = 4)
dimnames(sebpredmat) <- list(Population = colnames(predictions$posterior),
`membership probability` = c("50%", "95%", "99%",
"99.9%"))
signif(sebpredmat, 3)
## membership probability
## Population 50% 95% 99% 99.9%
## JHallCr 0 0 0.000 0.000
## NFChetHigh 0 0 0.000 0.000
## Coast 1 1 0.985 0.909
## Winchuck 0 0 0.000 0.000
## ChetcoMain 0 0 0.000 0.000